bitkeeper revision 1.1236.1.180 (424d0cc0fMbtHkfJJ78Iu20ay7GbmA)
author kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>
Fri, 1 Apr 2005 08:56:32 +0000 (08:56 +0000)
committer kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>
Fri, 1 Apr 2005 08:56:32 +0000 (08:56 +0000)
Clean up the blkback data path -- each scatter-gather request now maps to
a contiguous extent of a single disk (we no longer create VBDs directly
out of multiple physical extents; that kind of thing is left to LVM). The
2.6 data path also creates the smallest number of bios it can, to avoid
unnecessary remerging in the lower block layers.
Signed-off-by: Keir Fraser <keir@xensource.com>
linux-2.6.11-xen-sparse/drivers/xen/blkback/blkback.c
linux-2.6.11-xen-sparse/drivers/xen/blkback/common.h
linux-2.6.11-xen-sparse/drivers/xen/blkback/vbd.c
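
For reference, a minimal hypothetical sketch of the bio-packing scheme the
2.6 data path now uses (the real code is dispatch_rw_block_io() in
blkback.c below): keep extending the current bio with bio_add_page() and
open a new one only when the queue refuses the page, so a fully contiguous
request ends up in a single bio. struct sg_seg, MAX_SEGS and
submit_packed() are stand-ins; bio_alloc(), bio_add_page() and
submit_bio() are the stock 2.6 block-layer calls.

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    #define MAX_SEGS 11  /* stand-in for BLKIF_MAX_SEGMENTS_PER_REQUEST */

    struct sg_seg {
        struct page *page;      /* page backing this segment               */
        unsigned int offset;    /* byte offset of the data within the page */
        unsigned int nsec;      /* segment length in 512-byte sectors      */
    };

    /* Completion stub: free the bio once the last byte is done. */
    static int sketch_end_io(struct bio *bio, unsigned int done, int error)
    {
        if (bio->bi_size)
            return 1;           /* partial completion -- keep the bio */
        bio_put(bio);
        return 0;
    }

    /* Pack nseg (<= MAX_SEGS) disk-contiguous segments into as few bios
     * as possible and submit them.  Returns the number of bios used, or
     * -ENOMEM on allocation failure. */
    static int submit_packed(int rw, struct block_device *bdev,
                             sector_t sector, struct sg_seg *seg,
                             unsigned int nseg)
    {
        struct bio *bio = NULL, *biolist[MAX_SEGS];
        unsigned int i;
        int j, nbio = 0;

        for (i = 0; i < nseg; i++) {
            int len = seg[i].nsec << 9;   /* segment length in bytes */

            /* Extend the current bio; bio_add_page() returns less than
             * the requested length when a queue limit is hit, and then
             * we open a fresh bio sized for the segments that remain. */
            while (bio == NULL ||
                   bio_add_page(bio, seg[i].page, len,
                                seg[i].offset) < len) {
                bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg - i);
                if (bio == NULL) {
                    for (j = 0; j < nbio - 1; j++) /* last slot is NULL */
                        bio_put(biolist[j]);
                    return -ENOMEM;
                }
                bio->bi_bdev   = bdev;
                bio->bi_sector = sector;  /* start sector of this bio */
                bio->bi_end_io = sketch_end_io;
            }
            sector += seg[i].nsec;
        }

        for (j = 0; j < nbio; j++)
            submit_bio(rw, biolist[j]);
        return nbio;
    }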

linux-2.6.11-xen-sparse/drivers/xen/blkback/blkback.c
index a1caf0a0cf3a8e2e0cd3deae2d424a587ef1843d..4c3c8a5d06b176b1fa265e209d628d5722f71a34 100644
 #define BATCH_PER_DOMAIN 16
 
 static unsigned long mmap_vstart;
-#define MMAP_PAGES_PER_REQUEST \
-    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
-#define MMAP_PAGES             \
-    (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
-#define MMAP_VADDR(_req,_seg)                        \
-    (mmap_vstart +                                   \
-     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
+#define MMAP_PAGES                                              \
+    (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_req,_seg)                                   \
+    (mmap_vstart +                                              \
+     ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +    \
      ((_seg) * PAGE_SIZE))
 
 /*
@@ -102,7 +100,7 @@ static void make_response(blkif_t *blkif, unsigned long id,
 
 static void fast_flush_area(int idx, int nr_pages)
 {
-    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];
+    multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST];
     int               i;
 
     for ( i = 0; i < nr_pages; i++ )
@@ -384,94 +382,82 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
 {
     extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
     int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
-    short nr_sects;
-    unsigned long buffer, fas;
-    int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+    unsigned long fas, remap_prot;
+    int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
     pending_req_t *pending_req;
-    unsigned long  remap_prot;
-    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];
-
-    /* We map virtual scatter/gather segments to physical segments. */
-    int new_segs, nr_psegs = 0;
-    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1];
+    multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    struct phys_req preq;
+    struct { 
+        unsigned long buf; unsigned int nsec;
+    } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    unsigned int nseg;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+    struct buffer_head *bh;
+#else
+    struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    int nbio = 0;
+    request_queue_t *q;
+#endif
 
     /* Check that number of segments is sane. */
-    if ( unlikely(req->nr_segments == 0) || 
-         unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
+    nseg = req->nr_segments;
+    if ( unlikely(nseg == 0) || 
+         unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
     {
-        DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
+        DPRINTK("Bad number of segments in request (%d)\n", nseg);
         goto bad_descriptor;
     }
 
-    /*
-     * Check each address/size pair is sane, and convert into a
-     * physical device and block offset. Note that if the offset and size
-     * crosses a virtual extent boundary, we may end up with more
-     * physical scatter/gather segments than virtual segments.
-     */
-    for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
-    {
-        fas      = req->frame_and_sects[i];
-        buffer   = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
-        nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
-
-        if ( nr_sects <= 0 )
-            goto bad_descriptor;
+    preq.dev           = req->device;
+    preq.sector_number = req->sector_number;
+    preq.nr_sects      = 0;
 
-        phys_seg[nr_psegs].dev           = req->device;
-        phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
-        phys_seg[nr_psegs].buffer        = buffer;
-        phys_seg[nr_psegs].nr_sects      = nr_sects;
-
-        /* Translate the request into the relevant 'physical device' */
-        new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation);
-        if ( new_segs < 0 )
-        { 
-            DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", 
-                    operation == READ ? "read" : "write", 
-                    req->sector_number + tot_sects, 
-                    req->sector_number + tot_sects + nr_sects, 
-                    req->device); 
+    for ( i = 0; i < nseg; i++ )
+    {
+        fas          = req->frame_and_sects[i];
+        seg[i].buf  = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
+        seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
+        if ( seg[i].nsec <= 0 )
             goto bad_descriptor;
-        }
-  
-        nr_psegs += new_segs;
-        ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1));
+        preq.nr_sects += seg[i].nsec;
     }
 
-    /* Nonsensical zero-sized request? */
-    if ( unlikely(nr_psegs == 0) )
+    if ( vbd_translate(&preq, blkif, operation) != 0 )
+    {
+        DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", 
+                operation == READ ? "read" : "write", preq.sector_number,
+                preq.sector_number + preq.nr_sects, preq.dev); 
         goto bad_descriptor;
+    }
 
     if ( operation == READ )
         remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
     else
         remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED;
 
-    for ( i = 0; i < nr_psegs; i++ )
+    for ( i = 0; i < nseg; i++ )
     {
         mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
         mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
-        mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot;
+        mcl[i].args[1] = (seg[i].buf & PAGE_MASK) | remap_prot;
         mcl[i].args[2] = 0;
-#ifdef CONFIG_XEN_BLKDEV_TAP_BE
-        mcl[i].args[3] = (blkif->is_blktap) ? ID_TO_DOM(req->id) : blkif->domid;
-#else
         mcl[i].args[3] = blkif->domid;
+#ifdef CONFIG_XEN_BLKDEV_TAP_BE
+        if ( blkif->is_blktap )
+            mcl[i].args[3] = ID_TO_DOM(req->id);
 #endif
         phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
-            FOREIGN_FRAME(phys_seg[i].buffer >> PAGE_SHIFT);
+            FOREIGN_FRAME(seg[i].buf >> PAGE_SHIFT);
     }
 
-    if ( unlikely(HYPERVISOR_multicall(mcl, nr_psegs) != 0) )
-        BUG();
+    BUG_ON(HYPERVISOR_multicall(mcl, nseg) != 0);
 
-    for ( i = 0; i < nr_psegs; i++ )
+    for ( i = 0; i < nseg; i++ )
     {
         if ( unlikely(mcl[i].args[5] != 0) )
         {
             DPRINTK("invalid buffer -- could not remap it\n");
-            fast_flush_area(pending_idx, nr_psegs);
+            fast_flush_area(pending_idx, nseg);
             goto bad_descriptor;
         }
     }
@@ -481,19 +467,17 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
     pending_req->id        = req->id;
     pending_req->operation = operation;
     pending_req->status    = BLKIF_RSP_OKAY;
-    pending_req->nr_pages  = nr_psegs;
-    atomic_set(&pending_req->pendcnt, nr_psegs);
-    pending_cons++;
+    pending_req->nr_pages  = nseg;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
 
+    atomic_set(&pending_req->pendcnt, nseg);
+    pending_cons++;
     blkif_get(blkif);
 
-    /* Now we pass each segment down to the real blkdev layer. */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
-    for ( i = 0; i < nr_psegs; i++ )
+    for ( i = 0; i < nseg; i++ )
     {
-        struct buffer_head *bh;
-
-        bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC);
+        bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL);
         if ( unlikely(bh == NULL) )
         {
             __end_block_io_op(pending_req, 0);
@@ -503,12 +487,12 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
         memset(bh, 0, sizeof (struct buffer_head));
 
         init_waitqueue_head(&bh->b_wait);
-        bh->b_size          = phys_seg[i].nr_sects << 9;
-        bh->b_dev           = phys_seg[i].dev;
-        bh->b_rdev          = phys_seg[i].dev;
-        bh->b_rsector       = (unsigned long)phys_seg[i].sector_number;
+        bh->b_size          = seg[i].nsec << 9;
+        bh->b_dev           = preq.dev;
+        bh->b_rdev          = preq.dev;
+        bh->b_rsector       = (unsigned long)preq.sector_number;
         bh->b_data          = (char *)MMAP_VADDR(pending_idx, i) +
-            (phys_seg[i].buffer & ~PAGE_MASK);
+            (seg[i].buf & ~PAGE_MASK);
         bh->b_page          = virt_to_page(MMAP_VADDR(pending_idx, i));
         bh->b_end_io        = end_block_io_op;
         bh->b_private       = pending_req;
@@ -522,40 +506,53 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
 
         /* Dispatch a single request. We'll flush it to disc later. */
         generic_make_request(operation, bh);
+
+        preq.sector_number += seg[i].nsec;
     }
+
 #else
-    for ( i = 0; i < nr_psegs; i++ )
-    {
-        struct bio *bio;
-        request_queue_t *q;
 
-        bio = bio_alloc(GFP_ATOMIC, 1);
-        if ( unlikely(bio == NULL) )
+    for ( i = 0; i < nseg; i++ )
+    {
+        while ( (bio == NULL) ||
+                (bio_add_page(bio,
+                              virt_to_page(MMAP_VADDR(pending_idx, i)),
+                              seg[i].nsec << 9,
+                              seg[i].buf & ~PAGE_MASK) <
+                 (seg[i].nsec << 9)) )
         {
-            __end_block_io_op(pending_req, 0);
-            continue;
+            bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
+            if ( unlikely(bio == NULL) )
+            {
+                for ( i = 0; i < (nbio-1); i++ )
+                    bio_put(biolist[i]);
+                fast_flush_area(pending_idx, nseg);
+                goto bad_descriptor;
+            }
+                
+            bio->bi_bdev    = preq.bdev;
+            bio->bi_private = pending_req;
+            bio->bi_end_io  = end_block_io_op;
+            bio->bi_sector  = preq.sector_number;
         }
 
-        bio->bi_bdev    = phys_seg[i].bdev;
-        bio->bi_private = pending_req;
-        bio->bi_end_io  = end_block_io_op;
-        bio->bi_sector  = phys_seg[i].sector_number;
+        preq.sector_number += seg[i].nsec;
+    }
 
-        bio_add_page(
-            bio,
-            virt_to_page(MMAP_VADDR(pending_idx, i)),
-            phys_seg[i].nr_sects << 9,
-            phys_seg[i].buffer & ~PAGE_MASK);
+    if ( (q = bdev_get_queue(bio->bi_bdev)) != plugged_queue )
+    {
+        flush_plugged_queue();
+        blk_get_queue(q);
+        plugged_queue = q;
+    }
 
-        if ( (q = bdev_get_queue(bio->bi_bdev)) != plugged_queue )
-        {
-            flush_plugged_queue();
-            blk_get_queue(q);
-            plugged_queue = q;
-        }
+    atomic_set(&pending_req->pendcnt, nbio);
+    pending_cons++;
+    blkif_get(blkif);
+
+    for ( i = 0; i < nbio; i++ )
+        submit_bio(operation, biolist[i]);
 
-        submit_bio(operation, bio);
-    }
 #endif
 
     return;
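
The MMAP_VADDR simplification at the top of this file drops the old spare
page (MMAP_PAGES_PER_REQUEST was BLKIF_MAX_SEGMENTS_PER_REQUEST + 1) and
lays the mapped pages out as a dense 2-D array: the pending-request index
selects a row, the segment index a page within it. A hypothetical worked
example, assuming 4 KiB pages and the interface's usual 11-segment
maximum:

    /* Standalone model of the new layout; SEGS stands in for
     * BLKIF_MAX_SEGMENTS_PER_REQUEST (assumed to be 11 here). */
    #define PAGE_SZ 4096UL
    #define SEGS    11UL

    #define MMAP_VADDR(vstart, req, seg) \
        ((vstart) + (((req) * SEGS) + (seg)) * PAGE_SZ)

    /* req 0, seg 0  -> vstart + 0x0000
     * req 0, seg 10 -> vstart + 0xA000
     * req 1, seg 0  -> vstart + 0xB000  (11 pages per request, not 12) */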
linux-2.6.11-xen-sparse/drivers/xen/blkback/common.h
index a9a801fa711029fafb505c6d5e848521134015bf..a698e01c648d3992cb9ad2016716e9ab6786b598 100644
@@ -80,30 +80,19 @@ blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle);
             blkif_disconnect_complete(_b);        \
     } while (0)
 
-typedef struct _vbd { 
-    blkif_vdev_t   vdevice;     /* what the domain refers to this vbd as */
-    unsigned char  readonly;    /* Non-zero -> read-only */
-    unsigned char  type;        /* VDISK_TYPE_xxx */
-    blkif_pdev_t   pdevice;     /* phys device that this vbd maps to */
-    struct block_device *bdev;
-    rb_node_t      rb;          /* for linking into R-B tree lookup struct */
-} vbd_t; 
-
 void vbd_create(blkif_be_vbd_create_t *create); 
 void vbd_destroy(blkif_be_vbd_destroy_t *delete); 
 int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds);
 void destroy_all_vbds(blkif_t *blkif);
 
-/* Describes a [partial] disk extent (part of a block io request) */
-typedef struct {
+struct phys_req {
     unsigned short       dev;
     unsigned short       nr_sects;
     struct block_device *bdev;
-    unsigned long        buffer;
     blkif_sector_t       sector_number;
-} phys_seg_t;
+};
 
-int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation); 
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); 
 
 void blkif_interface_init(void);
 void blkif_ctrlif_init(void);
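
With the per-extent phys_seg_t gone, the backend fills one struct phys_req
covering the whole request, and vbd_translate() now returns 0 on success
(previously the number of physical segments) or a negative errno. A
minimal usage sketch, with hypothetical req/blkif/total_sectors values
standing in for the caller's state:

    /* Hypothetical caller of the new single-extent interface; see
     * dispatch_rw_block_io() in blkback.c for the real one. */
    struct phys_req preq;

    preq.dev           = req->device;        /* virtual device id      */
    preq.sector_number = req->sector_number; /* start of whole request */
    preq.nr_sects      = total_sectors;      /* sum of segment lengths */

    if (vbd_translate(&preq, blkif, operation) != 0)
        goto bad_descriptor;  /* unknown VBD, read-only, or out of range */

    /* On success preq.dev/preq.bdev now name the physical device; the
     * sector offset is unchanged (vbd_translate() only bounds-checks). */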
linux-2.6.11-xen-sparse/drivers/xen/blkback/vbd.c
index 2a67780755d7c5e147bfe602fcaee3b0662d55f0..f5ee589ee31c4fce99a82bd96e7cb8b2610651dc 100644
 
 #include "common.h"
 
+struct vbd { 
+    blkif_vdev_t   vdevice;     /* what the domain refers to this vbd as */
+    unsigned char  readonly;    /* Non-zero -> read-only */
+    unsigned char  type;        /* VDISK_TYPE_xxx */
+    blkif_pdev_t   pdevice;     /* phys device that this vbd maps to */
+    struct block_device *bdev;
+    rb_node_t      rb;          /* for linking into R-B tree lookup struct */
+}; 
+
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
 static inline dev_t vbd_map_devnum(blkif_pdev_t cookie)
 { return MKDEV(cookie>>8, cookie&0xff); }
@@ -25,7 +34,7 @@ static inline dev_t vbd_map_devnum(blkif_pdev_t cookie)
 
 void vbd_create(blkif_be_vbd_create_t *create) 
 {
-    vbd_t       *vbd; 
+    struct vbd  *vbd; 
     rb_node_t  **rb_p, *rb_parent = NULL;
     blkif_t     *blkif;
     blkif_vdev_t vdevice = create->vdevice;
@@ -43,7 +52,7 @@ void vbd_create(blkif_be_vbd_create_t *create)
     while ( *rb_p != NULL )
     {
         rb_parent = *rb_p;
-        vbd = rb_entry(rb_parent, vbd_t, rb);
+        vbd = rb_entry(rb_parent, struct vbd, rb);
         if ( vdevice < vbd->vdevice )
         {
             rb_p = &rb_parent->rb_left;
@@ -60,7 +69,7 @@ void vbd_create(blkif_be_vbd_create_t *create)
         }
     }
 
-    if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) )
+    if ( unlikely((vbd = kmalloc(sizeof(struct vbd), GFP_KERNEL)) == NULL) )
     {
         DPRINTK("vbd_create: out of memory\n");
         create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
@@ -115,7 +124,7 @@ void vbd_create(blkif_be_vbd_create_t *create)
 void vbd_destroy(blkif_be_vbd_destroy_t *destroy) 
 {
     blkif_t           *blkif;
-    vbd_t             *vbd;
+    struct vbd        *vbd;
     rb_node_t         *rb;
     blkif_vdev_t       vdevice = destroy->vdevice;
 
@@ -131,7 +140,7 @@ void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
     rb = blkif->vbd_rb.rb_node;
     while ( rb != NULL )
     {
-        vbd = rb_entry(rb, vbd_t, rb);
+        vbd = rb_entry(rb, struct vbd, rb);
         if ( vdevice < vbd->vdevice )
             rb = rb->rb_left;
         else if ( vdevice > vbd->vdevice )
@@ -154,14 +163,14 @@ void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
 
 void destroy_all_vbds(blkif_t *blkif)
 {
-    vbd_t             *vbd;
-    rb_node_t         *rb;
+    struct vbd *vbd;
+    rb_node_t  *rb;
 
     spin_lock(&blkif->vbd_lock);
 
     while ( (rb = blkif->vbd_rb.rb_node) != NULL )
     {
-        vbd = rb_entry(rb, vbd_t, rb);
+        vbd = rb_entry(rb, struct vbd, rb);
         rb_erase(rb, &blkif->vbd_rb);
         spin_unlock(&blkif->vbd_lock);
         bdev_put(vbd->bdev);
@@ -173,7 +182,8 @@ void destroy_all_vbds(blkif_t *blkif)
 }
 
 
-static void vbd_probe_single(blkif_t *blkif, vdisk_t *vbd_info, vbd_t *vbd)
+static void vbd_probe_single(
+    blkif_t *blkif, vdisk_t *vbd_info, struct vbd *vbd)
 {
     vbd_info->device   = vbd->vdevice; 
     vbd_info->info     = vbd->type | (vbd->readonly ? VDISK_FLAG_RO : 0);
@@ -199,7 +209,8 @@ int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds)
     for ( ; ; )
     {
         /* STEP 2. Dealt with left subtree. Now process current node. */
-        vbd_probe_single(blkif, &vbd_info[nr_vbds], rb_entry(rb, vbd_t, rb));
+        vbd_probe_single(blkif, &vbd_info[nr_vbds],
+                         rb_entry(rb, struct vbd, rb));
         if ( ++nr_vbds == max_vbds )
             goto out;
 
@@ -232,11 +243,11 @@ int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds)
 }
 
 
-int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation)
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
 {
-    vbd_t     *vbd;
-    rb_node_t *rb;
-    int        rc = -EACCES;
+    struct vbd *vbd;
+    rb_node_t  *rb;
+    int         rc = -EACCES;
 
     /* Take the vbd_lock because another thread could be updating the tree. */
     spin_lock(&blkif->vbd_lock);
@@ -244,10 +255,10 @@ int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation)
     rb = blkif->vbd_rb.rb_node;
     while ( rb != NULL )
     {
-        vbd = rb_entry(rb, vbd_t, rb);
-        if ( pseg->dev < vbd->vdevice )
+        vbd = rb_entry(rb, struct vbd, rb);
+        if ( req->dev < vbd->vdevice )
             rb = rb->rb_left;
-        else if ( pseg->dev > vbd->vdevice )
+        else if ( req->dev > vbd->vdevice )
             rb = rb->rb_right;
         else
             goto found;
@@ -263,12 +274,12 @@ int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation)
     if ( (operation == WRITE) && vbd->readonly )
         goto out;
 
-    if ( unlikely((pseg->sector_number + pseg->nr_sects) > vbd_sz(vbd)) )
+    if ( unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)) )
         goto out;
 
-    pseg->dev  = vbd->pdevice;
-    pseg->bdev = vbd->bdev;
-    rc = 1;
+    req->dev  = vbd->pdevice;
+    req->bdev = vbd->bdev;
+    rc = 0;
 
  out:
     spin_unlock(&blkif->vbd_lock);